home *** CD-ROM | disk | FTP | other *** search
Text File | 1987-11-03 | 6.6 KB | 162 lines | [TEXT/KAHL] |
- /* revised main indexer program 'qndxr.c' by ^z -- 870820-...
- *
- * revised 870930-1002 to give the user more options and control of how
- * delimiters are defined and handled. Now can choose:
- * - the default: any non-alphanumeric characters are delimiters (my
- * original approach, and perhaps the simplest to understand and use);
- * - option "-e": keep punctuation characters ONLY when they are embedded
- * in between non-delimiter characters, otherwise turn them into
- * delimiters (approach invented by Bill Hole of MicroDynamics Ltd.;
- * very nice for displaying 'words' such as don't, non-ASCII, 3.14159,
- * 87/10/02, etc. without splitting them up and requiring proximity
- * searching);
- * - option "-a": keep ALL punctuation characters, whether embedded in
- * words or not (best for indexing FORTH programs and such, but does
- * make spurious 'distinct' words at ends of sentences, etc.);
- * - option "-s": keep all special (non-ASCII) characters (with values
- * in the range 128-255; in the Apple character set, these are the
- * symbols that carry umlauts, tilde, accents, etc., making this
- * option the best for foreign-language and symbol-heavy files);
- * default is to remove the special characters.
- *
- * Note that option "-s" can be combined with any of the other options;
- * options "-e" and "-a" are mutually exclusive. (Actually, "-a" in my
- * implementation overrides "-e".) The "-e" option does require
- * peeking ahead one character before deciding on the fate of
- * a punctuation symbol, but that isn't too hard to arrange....
- *
- * ---------------------------------------------------------------
- *
- * Synopsis of the qndxr.c program's approach to sorting an index:
- * - load a chunk of the input file into memory; during the loading,
- * filter the characters appropriately, turning lower-case
- * into capitals, changing delimiters into \0's, and noting down
- * the locations of the beginnings of keys (words) in a ptr array;
- * - do a quicksort on the ptr array, using the keys in the buffer
- * to sort on;
- * - write the resulting sorted list of pointers along with their keys
- * to disk as files, named x0k0 and x0p0, in the *.k and *.p formats
- * used in ndxr.15;
- * - repeat the above steps with another chunk of the input file, and
- * name the resulting files x0k1 and x0p1; repeat, etc., until the
- * input file is all sorted into chunks;
- * - begin to merge the sorted files in an N-way merge ... for the
- * specific case of N=2, for example, merge x0k0/x0p0 and
- * x0k1/x0p1 into x1k0/x1p0; merge x0k2/x0p2 and x0k3/x0p3 into
- * x1k1/x1p1; etc., deleting the 0th-generation files as they are
- * merged;
- * - merge x1k0/x1p0 and x1k1/x1p1 into x2k0/x2p0, etc., and continue
- * merging subsequent generations until everything has become a
- * single pair of index files, xnk0/xnp0, which is then renamed
- * to be the original document name with '.k' and '.p' endings.
- *
- * ---------------------------------------------------------------
- *
- * Comparison with the older radix sort approach:
- * The new quicksort/mergesort method gains a bit in speed (since so
- * much of the work is done in memory rather than streaming from disk
- * back and forth) and also saves on disk space requirements (since
- * the k and p files are significantly compressed relative to the raw
- * index files formerly used during index sorting). The old radix sort
- * tended to only achieve about 2 MB/hour indexing on my Mac Plus setup,
- * and it required 5-6 times the size of the original file in free
- * disk space during indexing. This new approach achieves >4 MB/hour
- * in tests on the same Mac Plus, and only requires about 1.9 times
- * the original file size in free space!
- *
- * The new approach also allows for greater key length -- try recompiling
- * with KEY_LENGTH of 28, for instance -- without such severe disk space
- * penalties as the old radix sort would have incurred, and without severe
- * time penalties. (Duplicate words in the chunks are only stored once
- * in the key file, though each must have an entry in the pointer file.)
- *
- * The only obvious disadvantage of the quicksort/mergesort approach is
- * that it is an O(NlogN) procedure rather than O(N), and thus gets slower
- * when files get very large. (Radix sorting is strictly linear in N,
- * in theory anyway.)
- *
- * ---------------------------------------------------------------
- *
- * For further details, contact the author:
- * Mark Zimmermann science (at) NEMS.ARPA
- * 9511 Gwyndale Dr. [75066,2044] on CompuServe
- * Silver Spring, MD 20910
- * 301-565-2166
- * ---------------------------------------------------------------
- */
-
- #include <stdio.h>
- #include <unix.h>
- #include <storage.h>
- #include <strings.h>
- #include <ctype.h>
- #include <proto.h>
- #include "qndxr.2.h"
-
/* define a global variable to hold the chosen size of a fundamental
 * quantum of buffer space ... experiments with dynamically choosing
 * it at runtime seem to cause occasional problems on the Macintosh,
 * and have other risks with virtual memory systems, so default to
 * DEFAULT_MEMSIZ total buffer space but let the user override with
 * a command-line choice to suit ... see function set_zbufsiz() for
 * further discussion....
 */
long zbufsiz;

/* flag: TRUE when option "-a" was given, i.e. ALL punctuation chars
 * are kept as parts of words rather than treated as delimiters...
 */
int keep_all_punct = FALSE;

/* flag: TRUE when option "-e" was given, i.e. punctuation characters
 * are preserved only when embedded between non-delimiter characters
 * (overridden by "-a"; see header comment above)...
 */
int keep_embedded_punct = FALSE;

/* flag: TRUE when option "-s" was given, i.e. special non-ASCII
 * characters (values 128-255) are kept rather than removed...
 */
int keep_special_chars = FALSE;

/* the open document (input) file, shared by the indexing and
 * merging routines
 */
FILE *doc_file;
-
- /* main program to do the work ...
- */
-
- void _main(argc, argv)
- int argc;
- char *argv[];
- {
- unsigned long start_time, end_time, time();
- long set_params(), file_size();
- char *ctime();
-
- time (&start_time);
- printf ("Starting work: %s\n", ctime (&start_time));
-
- printf ("\nOpening document file \"%s\"...\n", argv[1]);
- doc_file = open_docfile (argc, argv);
-
- zbufsiz = set_params (argc, argv);
- printf ("Using buffers of size %ld each...\n", zbufsiz);
-
- printf ("Beginning to build index...\n");
- while (build_indices ());
-
- printf ("Beginning to merge index subfiles...\n");
- while (merge_indices (argv[1]));
-
- time (&end_time);
- printf ("\nEnding work: %s\n", ctime (&end_time));
- printf ("Elapsed time = %ld seconds.\n", end_time - start_time);
- printf ("Indexing rate = %.2f megabytes/hour.\n", file_size (doc_file) *
- 0.003600 / ( end_time - start_time ));
- printf ("Input database file \"%s\" was %ld bytes long.\n", argv[1],
- file_size (doc_file));
-
- fclose (doc_file);
- }
-
-